library(here)
library(readxl)
library(readr)
library(dplyr)
library(tidyr)
library(ggplot2)
library(RColorBrewer)
library(scales)
# Set theme for all plots
theme_set(theme_bw(base_size = 12))
# Create output directory
output_dir <- params$output_dir
dir.create(output_dir, recursive = TRUE, showWarnings = FALSE)
# Helper function to save figures in both PNG and PDF formats
save_figure <- function(plot, filename, width = 10, height = 8) {
ggsave(file.path(output_dir, paste0(filename, ".png")), plot,
width = width, height = height, dpi = 300)
ggsave(file.path(output_dir, paste0(filename, ".pdf")), plot,
width = width, height = height)
cat("Saved:", filename, ".png/.pdf\n")
}
# Load preprocessed count data (created by 00_download_and_preprocess.R)
counts_file <- here("data", "processed", "mirna_counts_filtered.csv")
metadata_file <- here("data", "metadata", "sample_metadata.csv")
if (!file.exists(counts_file)) {
stop("Preprocessed counts not found. Run 00_download_and_preprocess.R first.")
}
# Load data
count_data <- read_csv(counts_file, show_col_types = FALSE)
sample_metadata <- read_csv(metadata_file, show_col_types = FALSE)
# Convert to matrix with Gene_ID as rownames
raw_counts <- as.data.frame(count_data[, -1]) # remove Gene_ID column
rownames(raw_counts) <- count_data$Gene_ID
# Create sample mapping for compatibility with downstream code
sample_mapping <- sample_metadata %>%
dplyr::select(Sample, Treatment_Group) %>%
as.data.frame()
rownames(sample_mapping) <- sample_mapping$Sample
names(sample_mapping)[names(sample_mapping) == "Treatment_Group"] <- "comparison_7"
# Define sample groups for analysis
analysis_groups <- c("CF_Asp_Neg", "HC", "UNSTIM")
samples_to_analyze <- sample_metadata %>%
dplyr::filter(Treatment_Group %in% analysis_groups) %>%
pull(Sample)
# Filter to samples in analysis
samples_to_analyze <- intersect(samples_to_analyze, colnames(raw_counts))
cat("Total samples in count data:", ncol(raw_counts), "\n")
## Total samples in count data: 78
cat("Total samples for analysis:", length(samples_to_analyze), "\n")
## Total samples for analysis: 46
cat("Groups included:", paste(analysis_groups, collapse = ", "), "\n")
## Groups included: CF_Asp_Neg, HC, UNSTIM
# Extract counts for selected samples
raw_counts_subset <- raw_counts[, samples_to_analyze]
# Ensure numeric
raw_counts_subset <- as.data.frame(lapply(raw_counts_subset, as.numeric))
rownames(raw_counts_subset) <- rownames(raw_counts)
# Calculate total reads per sample
sample_totals <- colSums(raw_counts_subset, na.rm = TRUE)
# Calculate fractional abundance (each sample sums to 1)
frac_abundance <- sweep(raw_counts_subset, 2, sample_totals, FUN = "/")
# Verify all samples sum to 1
cat("\nVerification - all samples sum to 1.0:\n")
##
## Verification - all samples sum to 1.0:
print(summary(colSums(frac_abundance)))
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 1 1 1 1 1 1
cat("\nTotal miRNAs detected:", nrow(frac_abundance), "\n")
##
## Total miRNAs detected: 2083
cat("Total reads across all samples:", format(sum(sample_totals), big.mark = ","), "\n")
## Total reads across all samples: 104,190,306
This replaces everything from the start of section 2 through the end of section 3. The key changes:
data/processed/mirna_counts_filtered.csv
instead of raw GEO Excelfrac_abundance,
sample_mapping, raw_counts) so downstream code
works unchangedAfter making this change, re-render the Rmd and check if the ANOVA p-values now match.
# Calculate mean fractional abundance across all samples
mean_frac_abundance <- rowMeans(frac_abundance)
# Define threshold (1% = 0.01 fractional abundance)
threshold_pct <- 1
threshold_fraction <- threshold_pct / 100
# Identify miRNAs meeting the threshold
major_mirnas <- mean_frac_abundance[mean_frac_abundance >= threshold_fraction]
major_mirnas_sorted <- sort(major_mirnas, decreasing = TRUE)
major_mirna_names <- names(major_mirnas_sorted)
# Create detailed summary table
major_mirnas_table <- data.frame(
Rank = 1:length(major_mirnas_sorted),
miRNA = major_mirna_names,
Mean_Fraction = major_mirnas_sorted,
Percentage = major_mirnas_sorted * 100,
Cumulative_Percentage = cumsum(major_mirnas_sorted) * 100
)
cat("\n=== Major miRNAs (≥1% of Total Reads) ===\n\n")
##
## === Major miRNAs (≥1% of Total Reads) ===
print(major_mirnas_table, row.names = FALSE, digits = 4)
## Rank miRNA Mean_Fraction Percentage Cumulative_Percentage
## 1 miR-6126 0.33169 33.169 33.17
## 2 miR-3197 0.09459 9.459 42.63
## 3 miR-2861 0.05651 5.651 48.28
## 4 miR-6131 0.04833 4.833 53.11
## 5 miR-4417 0.04205 4.205 57.32
## 6 miR-6780b-5p 0.02331 2.331 59.65
## 7 miR-149-3p 0.02273 2.273 61.92
## 8 miR-21-5p 0.01997 1.997 63.92
## 9 miR-6803-5p 0.01771 1.771 65.69
## 10 miR-1237-5p 0.01291 1.291 66.98
## 11 miR-3687 0.01221 1.221 68.20
## 12 miR-1273d 0.01157 1.157 69.36
## 13 miR-3648 0.01012 1.012 70.37
# Summary statistics
total_major_fraction <- sum(major_mirnas_sorted)
remaining_fraction <- 1 - total_major_fraction
n_major <- length(major_mirnas_sorted)
n_total <- nrow(frac_abundance)
cat("\n=== Summary Statistics ===\n")
##
## === Summary Statistics ===
cat("Number of miRNAs with ≥1% abundance:", n_major, "\n")
## Number of miRNAs with ≥1% abundance: 13
cat("These", n_major, "miRNAs collectively represent:",
sprintf("%.2f%%", total_major_fraction * 100), "of all reads\n")
## These 13 miRNAs collectively represent: 70.37% of all reads
cat("Remaining", n_total - n_major, "miRNAs represent:",
sprintf("%.2f%%", remaining_fraction * 100), "of all reads\n")
## Remaining 2070 miRNAs represent: 29.63% of all reads
cat("Concentration ratio (Major / Remaining):",
sprintf("%.2f", total_major_fraction / remaining_fraction), "\n")
## Concentration ratio (Major / Remaining): 2.38
cat("\nMost abundant:", major_mirna_names[1],
"at", sprintf("%.2f%%", major_mirnas_sorted[1] * 100), "\n")
##
## Most abundant: miR-6126 at 33.17%
cat("Least abundant (of major):", major_mirna_names[n_major],
"at", sprintf("%.2f%%", major_mirnas_sorted[n_major] * 100), "\n")
## Least abundant (of major): miR-3648 at 1.01%
# Save summary table
write.csv(major_mirnas_table, file.path(output_dir, "major_miRNAs_1pct_summary.csv"),
row.names = FALSE)
# Calculate appropriate figure height based on number of miRNAs
fig_height <- max(6, n_major * 0.35)
# Prepare data for plotting
major_plot_data <- major_mirnas_table %>%
mutate(miRNA = factor(miRNA, levels = rev(major_mirna_names))) # Reverse for top-down display
# Create bar plot
p1 <- ggplot(major_plot_data, aes(x = miRNA, y = Percentage)) +
geom_bar(stat = "identity", fill = "steelblue", alpha = 0.8) +
geom_hline(yintercept = 1, linetype = "dashed", color = "red", size = 0.8) +
geom_text(aes(label = sprintf("%.2f%%", Percentage)),
hjust = -0.1, size = 3.5) +
coord_flip() +
labs(
title = "Major miRNAs (≥1% Average Abundance)",
subtitle = sprintf("%d miRNAs collectively represent %.1f%% of all reads",
n_major, total_major_fraction * 100),
x = "miRNA",
y = "Mean Fractional Abundance (%)"
) +
theme_minimal(base_size = 12) +
theme(
plot.title = element_text(face = "bold", size = 14),
axis.text.y = element_text(face = "bold")
) +
scale_y_continuous(expand = expansion(mult = c(0, 0.15)))
print(p1)
save_figure(p1, "major_miRNAs_barplot", width = 10, height = fig_height)
## Saved: major_miRNAs_barplot .png/.pdf
# Rank all miRNAs by abundance
all_ranked <- data.frame(
miRNA = names(mean_frac_abundance),
Mean_Fraction = mean_frac_abundance
) %>%
arrange(desc(Mean_Fraction)) %>%
mutate(
Rank = 1:n(),
Cumulative_Fraction = cumsum(Mean_Fraction),
Cumulative_Percentage = Cumulative_Fraction * 100,
Category = ifelse(Rank <= n_major, "Major (≥1%)", "Others")
)
# Create cumulative plot
p2 <- ggplot(all_ranked, aes(x = Rank, y = Cumulative_Percentage)) +
geom_line(size = 1.2, color = "darkblue") +
geom_point(data = all_ranked[all_ranked$Rank <= n_major, ],
aes(x = Rank, y = Cumulative_Percentage),
color = "red", size = 3) +
geom_hline(yintercept = total_major_fraction * 100,
linetype = "dashed", color = "red", size = 0.8) +
geom_vline(xintercept = n_major, linetype = "dashed", color = "gray50", size = 0.8) +
annotate("text", x = n_major, y = 5,
label = paste("Top", n_major), color = "red", fontface = "bold", hjust = -0.1) +
annotate("text", x = 200, y = total_major_fraction * 100 + 5,
label = sprintf("%.1f%%", total_major_fraction * 100),
color = "red", fontface = "bold") +
labs(
title = "Cumulative Fractional Abundance of miRNAs",
subtitle = paste("Red points indicate major miRNAs (≥1% average abundance)"),
x = "miRNA Rank (by abundance)",
y = "Cumulative Percentage (%)"
) +
theme_minimal(base_size = 12) +
theme(
plot.title = element_text(face = "bold", size = 14)
) +
scale_x_continuous(breaks = c(1, n_major, 50, 100, 200, 400, 600)) +
scale_y_continuous(breaks = seq(0, 100, 10))
print(p2)
save_figure(p2, "cumulative_abundance_plot", width = 10, height = 6)
## Saved: cumulative_abundance_plot .png/.pdf
# Create simplified composition data
composition_data <- data.frame(
Sample = "All Samples Combined",
Major = total_major_fraction * 100,
Others = remaining_fraction * 100
) %>%
pivot_longer(cols = c(Major, Others),
names_to = "Category",
values_to = "Percentage")
# Create stacked bar
p3 <- ggplot(composition_data, aes(x = Sample, y = Percentage, fill = Category)) +
geom_bar(stat = "identity", width = 0.5) +
geom_text(aes(label = sprintf("%.1f%%", Percentage)),
position = position_stack(vjust = 0.5),
size = 6, fontface = "bold", color = "white") +
scale_fill_manual(
values = c("Major" = "steelblue", "Others" = "gray70"),
labels = c("Major" = paste("Major miRNAs (≥1%, n =", n_major, ")"),
"Others" = "All Other miRNAs")
) +
labs(
title = "Compositional Dominance of Major miRNAs",
y = "Percentage of Total Reads (%)",
x = NULL,
fill = "Category"
) +
theme_minimal(base_size = 14) +
theme(
plot.title = element_text(face = "bold", size = 16, hjust = 0.5),
legend.position = "bottom",
legend.title = element_text(face = "bold"),
axis.text.x = element_blank()
) +
scale_y_continuous(breaks = seq(0, 100, 10))
print(p3)
save_figure(p3, "composition_stacked_bar", width = 10, height = 6)
## Saved: composition_stacked_bar .png/.pdf
# Calculate appropriate figure height based on number of miRNAs
fig_height <- max(6, n_major * 0.4)
# Extract major miRNA data for all samples
major_data <- frac_abundance[major_mirna_names, ] * 100 # Convert to percentage
# Add treatment group information
sample_groups <- sample_mapping[colnames(major_data), "comparison_7"]
# Prepare data for heatmap
heatmap_data <- major_data %>%
as.data.frame() %>%
mutate(miRNA = rownames(.)) %>%
pivot_longer(cols = -miRNA, names_to = "Sample", values_to = "Percentage") %>%
mutate(
miRNA = factor(miRNA, levels = major_mirna_names),
Group = sample_mapping[Sample, "comparison_7"]
)
# Create heatmap
p4 <- ggplot(heatmap_data, aes(x = Sample, y = miRNA, fill = Percentage)) +
geom_tile(color = "white", size = 0.5) +
scale_fill_gradient2(
low = "white",
mid = "lightblue",
high = "darkblue",
midpoint = median(heatmap_data$Percentage),
name = "Abundance\n(%)"
) +
labs(
title = "Fractional Abundance of Major miRNAs Across Samples",
subtitle = "miRNAs with ≥1% average abundance",
x = "Sample",
y = "miRNA"
) +
theme_minimal(base_size = 10) +
theme(
plot.title = element_text(face = "bold", size = 14),
axis.text.x = element_text(angle = 90, hjust = 1, vjust = 0.5, size = 6),
axis.text.y = element_text(face = "bold"),
legend.position = "right"
) +
facet_grid(. ~ Group, scales = "free_x", space = "free_x")
print(p4)
save_figure(p4, "major_miRNAs_heatmap_by_sample", width = 14, height = fig_height)
## Saved: major_miRNAs_heatmap_by_sample .png/.pdf
# Calculate appropriate figure height based on number of miRNAs
fig_height <- max(6, n_major * 0.6)
# Create boxplot for each miRNA across treatment groups
p5 <- ggplot(heatmap_data, aes(x = Group, y = Percentage, fill = Group)) +
geom_boxplot(alpha = 0.7, outlier.shape = 16, outlier.size = 1.5) +
geom_jitter(width = 0.2, alpha = 0.3, size = 1) +
facet_wrap(~ miRNA, scales = "free_y", ncol = 5) +
scale_fill_brewer(palette = "Set2") +
labs(
title = "Distribution of Major miRNA Abundances by Treatment Group",
subtitle = "miRNAs with ≥1% average abundance",
x = "Treatment Group",
y = "Fractional Abundance (%)"
) +
theme_bw(base_size = 10) +
theme(
plot.title = element_text(face = "bold", size = 14),
axis.text.x = element_text(angle = 45, hjust = 1),
strip.text = element_text(face = "bold"),
legend.position = "bottom"
)
print(p5)
save_figure(p5, "major_miRNAs_boxplot_by_group", width = 12, height = fig_height)
## Saved: major_miRNAs_boxplot_by_group .png/.pdf
Testing whether mean abundance of major miRNAs differs significantly between treatment groups using one-way ANOVA with post-hoc pairwise comparisons.
# Prepare data for statistical testing
test_data <- frac_abundance[major_mirna_names, ] %>%
as.data.frame() %>%
mutate(miRNA = rownames(.)) %>%
pivot_longer(cols = -miRNA, names_to = "sample", values_to = "fraction") %>%
mutate(
treatment = sample_mapping[sample, "comparison_7"],
percentage = fraction * 100 # Convert to percentage for easier interpretation
)
# Function to perform ANOVA and Tukey HSD for each miRNA
perform_anova_tests <- function(mirna_name, data) {
mirna_data <- data %>% filter(miRNA == mirna_name)
# Perform one-way ANOVA
anova_result <- aov(percentage ~ treatment, data = mirna_data)
anova_summary <- summary(anova_result)
# Extract p-value
anova_pval <- anova_summary[[1]][["Pr(>F)"]][1]
# Perform Tukey HSD post-hoc test if ANOVA is significant
if(anova_pval < 0.05) {
tukey_result <- TukeyHSD(anova_result)
tukey_comparisons <- as.data.frame(tukey_result$treatment)
tukey_comparisons$comparison <- rownames(tukey_comparisons)
tukey_comparisons$miRNA <- mirna_name
} else {
tukey_comparisons <- NULL
}
# Calculate mean abundance per treatment
treatment_means <- mirna_data %>%
group_by(treatment) %>%
summarise(mean_pct = mean(percentage),
sd_pct = sd(percentage),
n = n(),
.groups = 'drop')
return(list(
miRNA = mirna_name,
anova_pval = anova_pval,
tukey_comparisons = tukey_comparisons,
treatment_means = treatment_means
))
}
# Run ANOVA tests for all major miRNAs
anova_results <- lapply(major_mirna_names, perform_anova_tests, data = test_data)
# Create summary table of ANOVA results
anova_summary_table <- data.frame(
miRNA = sapply(anova_results, function(x) x$miRNA),
ANOVA_P_Value = sapply(anova_results, function(x) x$anova_pval),
Significant = sapply(anova_results, function(x) ifelse(x$anova_pval < 0.05, "Yes", "No"))
)
# Adjust for multiple testing using BH correction
anova_summary_table$BH_Adjusted_P <- p.adjust(anova_summary_table$ANOVA_P_Value,
method = "BH")
anova_summary_table$Significant_After_Correction <- ifelse(
anova_summary_table$BH_Adjusted_P < 0.05, "Yes", "No"
)
# Sort by p-value
anova_summary_table <- anova_summary_table %>% arrange(ANOVA_P_Value)
cat("\n=== ANOVA Results for Major miRNAs ===\n")
##
## === ANOVA Results for Major miRNAs ===
cat("Testing for differences in mean abundance between treatment groups\n")
## Testing for differences in mean abundance between treatment groups
cat("(UNSTIM vs HC vs CF)\n\n")
## (UNSTIM vs HC vs CF)
print(anova_summary_table, row.names = FALSE, digits = 4)
## miRNA ANOVA_P_Value Significant BH_Adjusted_P Significant_After_Correction
## miR-6126 0.0003900 Yes 0.00294 Yes
## miR-6803-5p 0.0004523 Yes 0.00294 Yes
## miR-2861 0.0192520 Yes 0.08185 No
## miR-21-5p 0.0251845 Yes 0.08185 No
## miR-3648 0.0316774 Yes 0.08236 No
## miR-3197 0.0520371 No 0.09594 No
## miR-1237-5p 0.0628196 No 0.09594 No
## miR-149-3p 0.0662925 No 0.09594 No
## miR-6780b-5p 0.0664177 No 0.09594 No
## miR-3687 0.1252155 No 0.16114 No
## miR-1273d 0.1363509 No 0.16114 No
## miR-6131 0.2982792 No 0.32314 No
## miR-4417 0.3282540 No 0.32825 No
# Count significant results
n_sig_raw <- sum(anova_summary_table$ANOVA_P_Value < 0.05)
n_sig_corrected <- sum(anova_summary_table$BH_Adjusted_P < 0.05)
cat("\n=== Summary ===\n")
##
## === Summary ===
cat("Total miRNAs tested:", n_major, "\n")
## Total miRNAs tested: 13
cat("Significant at α = 0.05 (uncorrected):", n_sig_raw, "\n")
## Significant at α = 0.05 (uncorrected): 5
cat("Significant after BH correction:", n_sig_corrected, "\n")
## Significant after BH correction: 2
# Save ANOVA results
write.csv(anova_summary_table, file.path(output_dir, "major_miRNAs_ANOVA_results.csv"),
row.names = FALSE)
# Extract and display post-hoc comparisons for significant miRNAs
if(n_sig_raw > 0) {
cat("\n=== Post-hoc Pairwise Comparisons (Tukey HSD) ===\n")
cat("For miRNAs with significant ANOVA results (p < 0.05)\n\n")
significant_mirnas <- anova_summary_table$miRNA[anova_summary_table$ANOVA_P_Value < 0.05]
posthoc_results <- data.frame()
for(mirna in significant_mirnas) {
result <- anova_results[[which(sapply(anova_results, function(x) x$miRNA) == mirna)]]
if(!is.null(result$tukey_comparisons)) {
tukey_df <- result$tukey_comparisons %>%
dplyr::select(miRNA, comparison, diff, `p adj`) %>%
mutate(
Significant = ifelse(`p adj` < 0.05, "Yes", "No")
)
posthoc_results <- rbind(posthoc_results, tukey_df)
}
}
if(nrow(posthoc_results) > 0) {
colnames(posthoc_results) <- c("miRNA", "Comparison", "Mean_Difference",
"Tukey_P_Value", "Significant")
print(posthoc_results, row.names = FALSE, digits = 4)
write.csv(posthoc_results, file.path(output_dir, "major_miRNAs_posthoc_comparisons.csv"),
row.names = FALSE)
}
# Create a summary of treatment means for significant miRNAs
cat("\n=== Mean Abundances by Treatment (Significant miRNAs Only) ===\n\n")
for(mirna in significant_mirnas) {
result <- anova_results[[which(sapply(anova_results, function(x) x$miRNA) == mirna)]]
cat(mirna, ":\n")
print(result$treatment_means, row.names = FALSE, digits = 3)
cat("\n")
}
} else {
cat("\nNo miRNAs showed significant differences between treatment groups at α = 0.05\n")
}
##
## === Post-hoc Pairwise Comparisons (Tukey HSD) ===
## For miRNAs with significant ANOVA results (p < 0.05)
##
## miRNA Comparison Mean_Difference Tukey_P_Value Significant
## miR-6126 HC-CF_Asp_Neg 5.4137 0.5406546 No
## miR-6126 UNSTIM-CF_Asp_Neg 20.6747 0.0003695 Yes
## miR-6126 UNSTIM-HC 15.2609 0.0121355 Yes
## miR-6803-5p HC-CF_Asp_Neg -1.1744 0.0503104 No
## miR-6803-5p UNSTIM-CF_Asp_Neg -2.0049 0.0002930 Yes
## miR-6803-5p UNSTIM-HC -0.8304 0.2114420 No
## miR-2861 HC-CF_Asp_Neg -1.0164 0.8367911 No
## miR-2861 UNSTIM-CF_Asp_Neg -4.8332 0.0201599 Yes
## miR-2861 UNSTIM-HC -3.8169 0.0936520 No
## miR-21-5p HC-CF_Asp_Neg 1.7913 0.0656208 No
## miR-21-5p UNSTIM-CF_Asp_Neg -0.2461 0.9424930 No
## miR-21-5p UNSTIM-HC -2.0374 0.0315782 Yes
## miR-3648 HC-CF_Asp_Neg -0.9553 0.1172778 No
## miR-3648 UNSTIM-CF_Asp_Neg -1.1766 0.0343769 Yes
## miR-3648 UNSTIM-HC -0.2213 0.8855847 No
##
## === Mean Abundances by Treatment (Significant miRNAs Only) ===
##
## miR-6126 :
## # A tibble: 3 × 4
## treatment mean_pct sd_pct n
## <chr> <dbl> <dbl> <int>
## 1 CF_Asp_Neg 24.3 14.6 16
## 2 HC 29.7 17.9 14
## 3 UNSTIM 45.0 7.79 16
##
## miR-6803-5p :
## # A tibble: 3 × 4
## treatment mean_pct sd_pct n
## <chr> <dbl> <dbl> <int>
## 1 CF_Asp_Neg 2.83 2.07 16
## 2 HC 1.65 0.851 14
## 3 UNSTIM 0.821 0.334 16
##
## miR-2861 :
## # A tibble: 3 × 4
## treatment mean_pct sd_pct n
## <chr> <dbl> <dbl> <int>
## 1 CF_Asp_Neg 7.64 5.57 16
## 2 HC 6.62 6.40 14
## 3 UNSTIM 2.81 1.28 16
##
## miR-21-5p :
## # A tibble: 3 × 4
## treatment mean_pct sd_pct n
## <chr> <dbl> <dbl> <int>
## 1 CF_Asp_Neg 1.54 1.71 16
## 2 HC 3.33 3.34 14
## 3 UNSTIM 1.29 0.570 16
##
## miR-3648 :
## # A tibble: 3 × 4
## treatment mean_pct sd_pct n
## <chr> <dbl> <dbl> <int>
## 1 CF_Asp_Neg 1.71 2.12 16
## 2 HC 0.756 0.474 14
## 3 UNSTIM 0.535 0.257 16
if(n_sig_raw > 0) {
# Get list of significant miRNAs
significant_mirnas <- anova_summary_table$miRNA[anova_summary_table$ANOVA_P_Value < 0.05]
# Filter data for significant miRNAs only
sig_plot_data <- test_data %>%
filter(miRNA %in% significant_mirnas) %>%
mutate(
miRNA = factor(miRNA, levels = significant_mirnas),
treatment = factor(treatment, levels = c("UNSTIM", "HC", "CF_Asp_Neg"),
labels = c("UNSTIM", "HC", "CF"))
)
# Prepare annotation data from Tukey HSD results
annotation_data <- data.frame()
for(mirna in significant_mirnas) {
result <- anova_results[[which(sapply(anova_results, function(x) x$miRNA) == mirna)]]
if(!is.null(result$tukey_comparisons)) {
tukey_df <- result$tukey_comparisons
max_val <- max(sig_plot_data$percentage[sig_plot_data$miRNA == mirna])
comparisons <- tukey_df$comparison
p_values <- tukey_df$`p adj`
for(i in 1:length(comparisons)) {
if(p_values[i] < 0.05) {
comp <- comparisons[i]
groups <- strsplit(comp, "-")[[1]]
groups <- recode(groups, "CF_Asp_Neg" = "CF")
if(p_values[i] < 0.001) {
sig_label <- "***"
} else if(p_values[i] < 0.01) {
sig_label <- "**"
} else {
sig_label <- "*"
}
annotation_data <- rbind(annotation_data, data.frame(
miRNA = mirna,
comparison = comp,
group1 = groups[1],
group2 = groups[2],
p_value = p_values[i],
label = sig_label,
y_position = max_val,
stringsAsFactors = FALSE
))
}
}
}
}
# Create base plot
p_sig <- ggplot(sig_plot_data, aes(x = treatment, y = percentage, fill = treatment)) +
geom_boxplot(alpha = 0.7, outlier.shape = 16, outlier.size = 1.5) +
geom_jitter(width = 0.2, alpha = 0.4, size = 1.5) +
stat_summary(fun = mean, geom = "point", shape = 23, size = 3,
fill = "red", color = "darkred") +
facet_wrap(~ miRNA, scales = "free_y", ncol = 4) +
scale_fill_brewer(palette = "Set2") +
labs(
title = "Major miRNAs with Significant Treatment Differences",
subtitle = "Red diamonds = group means; * p<0.05, ** p<0.01, *** p<0.001 (Tukey HSD)",
x = "Treatment Group",
y = "Fractional Abundance (%)",
fill = "Treatment"
) +
theme_bw(base_size = 11) +
theme(
plot.title = element_text(face = "bold", size = 14),
plot.subtitle = element_text(size = 10),
axis.text.x = element_text(angle = 45, hjust = 1),
strip.text = element_text(face = "bold", size = 10),
strip.background = element_rect(fill = "grey95"),
legend.position = "bottom"
)
# Add significance annotations if any exist
if(nrow(annotation_data) > 0) {
annotation_data_positioned <- annotation_data %>%
group_by(miRNA) %>%
mutate(
max_y = max(sig_plot_data$percentage[sig_plot_data$miRNA == miRNA]),
bracket_y = max_y + (max_y * 0.05 * row_number())
) %>%
ungroup()
treatment_positions <- c("UNSTIM" = 1, "HC" = 2, "CF" = 3)
annotation_data_positioned$x1 <- treatment_positions[annotation_data_positioned$group1]
annotation_data_positioned$x2 <- treatment_positions[annotation_data_positioned$group2]
annotation_data_positioned$x_mid <- (annotation_data_positioned$x1 +
annotation_data_positioned$x2) / 2
p_sig <- p_sig +
geom_segment(data = annotation_data_positioned,
aes(x = x1, xend = x1, y = bracket_y, yend = bracket_y * 1.02),
inherit.aes = FALSE, color = "black") +
geom_segment(data = annotation_data_positioned,
aes(x = x1, xend = x2, y = bracket_y * 1.02, yend = bracket_y * 1.02),
inherit.aes = FALSE, color = "black") +
geom_segment(data = annotation_data_positioned,
aes(x = x2, xend = x2, y = bracket_y * 1.02, yend = bracket_y),
inherit.aes = FALSE, color = "black") +
geom_text(data = annotation_data_positioned,
aes(x = x_mid, y = bracket_y * 1.04, label = label),
inherit.aes = FALSE, size = 4, fontface = "bold")
}
print(p_sig)
save_figure(p_sig, "major_miRNAs_significant_differences",
width = 12, height = max(6, n_sig_raw*1.5))
cat("\nSignificance annotations show Tukey HSD post-hoc test results:\n")
cat(" * p < 0.05\n")
cat(" ** p < 0.01\n")
cat("*** p < 0.001\n")
} else {
cat("No significant differences detected; skipping visualization.\n")
}
## Saved: major_miRNAs_significant_differences .png/.pdf
##
## Significance annotations show Tukey HSD post-hoc test results:
## * p < 0.05
## ** p < 0.01
## *** p < 0.001
# Create comprehensive summary table
summary_by_treatment <- test_data %>%
group_by(miRNA, treatment) %>%
summarise(
Mean_Percentage = mean(percentage),
SD = sd(percentage),
N = n(),
SE = sd(percentage) / sqrt(n()),
.groups = 'drop'
) %>%
pivot_wider(
names_from = treatment,
values_from = c(Mean_Percentage, SD, SE),
names_glue = "{treatment}_{.value}"
) %>%
left_join(
anova_summary_table %>% dplyr::select(miRNA, ANOVA_P_Value, BH_Adjusted_P),
by = "miRNA"
) %>%
arrange(ANOVA_P_Value)
cat("\n=== Mean Abundance by Treatment Group ===\n")
##
## === Mean Abundance by Treatment Group ===
print(summary_by_treatment, digits = 3)
## # A tibble: 26 × 13
## miRNA N CF_Asp_Neg_Mean_Perc…¹ HC_Mean_Percentage UNSTIM_Mean_Percentage CF_Asp_Neg_SD HC_SD UNSTIM_SD CF_Asp_Neg_SE
## <chr> <int> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 miR-6… 16 24.3 NA 45.0 14.6 NA 7.79 3.66
## 2 miR-6… 14 NA 29.7 NA NA 17.9 NA NA
## 3 miR-6… 16 2.83 NA 0.821 2.07 NA 0.334 0.517
## 4 miR-6… 14 NA 1.65 NA NA 0.851 NA NA
## 5 miR-2… 16 7.64 NA 2.81 5.57 NA 1.28 1.39
## 6 miR-2… 14 NA 6.62 NA NA 6.40 NA NA
## 7 miR-2… 16 1.54 NA 1.29 1.71 NA 0.570 0.427
## 8 miR-2… 14 NA 3.33 NA NA 3.34 NA NA
## 9 miR-3… 16 1.71 NA 0.535 2.12 NA 0.257 0.529
## 10 miR-3… 14 NA 0.756 NA NA 0.474 NA NA
## # ℹ 16 more rows
## # ℹ abbreviated name: ¹CF_Asp_Neg_Mean_Percentage
## # ℹ 4 more variables: HC_SE <dbl>, UNSTIM_SE <dbl>, ANOVA_P_Value <dbl>, BH_Adjusted_P <dbl>
write.csv(summary_by_treatment, file.path(output_dir, "major_miRNAs_treatment_summary.csv"),
row.names = FALSE)
cat("\nComprehensive summary saved to: major_miRNAs_treatment_summary.csv\n")
##
## Comprehensive summary saved to: major_miRNAs_treatment_summary.csv
This visualization shows the compositional breakdown for each individual sample, allowing you to see both within-group consistency and between-group differences.
# Prepare data for stacked bar chart
stack_data <- frac_abundance %>%
as.data.frame() %>%
mutate(miRNA = rownames(.)) %>%
pivot_longer(cols = -miRNA, names_to = "sample", values_to = "fraction") %>%
mutate(
miRNA_category = ifelse(miRNA %in% major_mirna_names, miRNA, "Other"),
treatment = sample_mapping[sample, "comparison_7"],
# Relabel CF_Asp_Neg to CF
treatment = recode(treatment, "CF_Asp_Neg" = "CF")
) %>%
group_by(sample, treatment, miRNA_category) %>%
summarise(total_fraction = sum(fraction), .groups = 'drop')
# Set factor levels: most abundant at bottom, "Other" at top
stack_data <- stack_data %>%
mutate(
miRNA_category = factor(miRNA_category,
levels = rev(c(major_mirna_names, "Other"))),
treatment = factor(treatment, levels = c("UNSTIM", "HC", "CF"))
)
# Create color palette: colors for major miRNAs + grey for "Other"
n_colors <- min(n_major, 12)
if (n_major <= 12) {
stack_colors <- rev(c(brewer.pal(n_major, "Paired"), "grey80"))
} else {
base_colors <- brewer.pal(12, "Paired")
stack_colors <- rev(c(rep(base_colors, ceiling(n_major/12))[1:n_major], "grey80"))
}
names(stack_colors) <- rev(c(major_mirna_names, "Other"))
# Create stacked bar plot
p6 <- ggplot(stack_data, aes(x = sample, y = total_fraction, fill = miRNA_category)) +
geom_bar(stat = "identity", width = 0.8) +
scale_fill_manual(values = stack_colors, name = "miRNA") +
facet_wrap(~ treatment, scales = "free_x", nrow = 1) +
labs(
title = "miRNA Composition by Sample and Treatment",
subtitle = sprintf("Major miRNAs (≥1%%, n=%d) shown individually (~%.1f%% of reads), remaining as 'Other'",
n_major, total_major_fraction * 100),
x = "",
y = "Fractional Abundance"
) +
theme_classic(base_size = 12) +
theme(
axis.text.x = element_text(angle = 45, hjust = 1, vjust = 1, size = 11),
strip.text = element_text(size = 11, face = "bold"),
strip.background = element_rect(fill = "grey95", color = "grey50"),
legend.position = "bottom",
legend.text = element_text(size = 9, family = "mono"),
plot.title = element_text(size = 14, face = "bold"),
plot.subtitle = element_text(size = 11)
) +
guides(fill = guide_legend(nrow = 2, byrow = TRUE, reverse = TRUE))
print(p6)
save_figure(p6, "major_miRNAs_stacked_by_sample", width = 14, height = 8)
## Saved: major_miRNAs_stacked_by_sample .png/.pdf
The three treatment groups show significantly different cumulative distribution functions, indicating that exposure conditions alter the compositional structure of miRNA cargo in extracellular vesicles.
# Prepare data for statistical testing
abundance_by_treatment <- frac_abundance %>%
as.data.frame() %>%
mutate(miRNA = rownames(.)) %>%
pivot_longer(cols = -miRNA, names_to = "sample", values_to = "fraction") %>%
mutate(treatment = sample_mapping[sample, "comparison_7"]) %>%
group_by(treatment, miRNA) %>%
summarise(mean_fraction = mean(fraction, na.rm = TRUE), .groups = 'drop') %>%
group_by(treatment) %>%
arrange(desc(mean_fraction)) %>%
mutate(
rank = row_number(),
cumsum_fraction = cumsum(mean_fraction),
total_fraction = sum(mean_fraction),
cumsum_pct = cumsum_fraction / total_fraction * 100,
log10_fraction = log10(mean_fraction + 1e-8) # Add small constant to avoid log(0)
) %>%
ungroup()
# Perform pairwise Kolmogorov-Smirnov tests
treatments <- c("UNSTIM", "HC", "CF_Asp_Neg")
ks_results <- data.frame()
for(i in 1:(length(treatments)-1)) {
for(j in (i+1):length(treatments)) {
treat1 <- treatments[i]
treat2 <- treatments[j]
data1 <- abundance_by_treatment %>%
filter(treatment == treat1) %>%
pull(mean_fraction)
data2 <- abundance_by_treatment %>%
filter(treatment == treat2) %>%
pull(mean_fraction)
ks_test <- ks.test(data1, data2)
ks_results <- rbind(ks_results, data.frame(
Comparison = paste(treat1, "vs", treat2),
KS_Statistic = round(as.numeric(ks_test$statistic), 4),
P_Value = formatC(ks_test$p.value, format = "e", digits = 3),
Significant = ifelse(ks_test$p.value < 0.05, "Yes", "No")
))
}
}
cat("\n=== Kolmogorov-Smirnov Test Results ===\n")
##
## === Kolmogorov-Smirnov Test Results ===
cat("Tests whether miRNA abundance distributions differ between treatment groups\n\n")
## Tests whether miRNA abundance distributions differ between treatment groups
print(ks_results, row.names = FALSE)
## Comparison KS_Statistic P_Value Significant
## UNSTIM vs HC 0.5655 9.468e-290 Yes
## UNSTIM vs CF_Asp_Neg 0.4710 4.507e-201 Yes
## HC vs CF_Asp_Neg 0.4364 1.061e-172 Yes
# Save results
write.csv(ks_results, file.path(output_dir, "KS_test_results.csv"), row.names = FALSE)
cat("\n\nInterpretation:\n")
##
##
## Interpretation:
cat("All pairwise comparisons show significant distributional differences (p < 0.05).\n")
## All pairwise comparisons show significant distributional differences (p < 0.05).
cat("This indicates that exposure to CF BAL, HC BAL, or no exposure (UNSTIM)\n")
## This indicates that exposure to CF BAL, HC BAL, or no exposure (UNSTIM)
cat("results in fundamentally different compositional structures of miRNA cargo.\n")
## results in fundamentally different compositional structures of miRNA cargo.
These plots reveal that HC (healthy control BAL) has a much broader distributional tail compared to UNSTIM and CF_Asp_Neg, suggesting greater compositional diversity or evenness. CF_Asp_Neg appears compositionally more similar to UNSTIM than to HC.
# Set factor levels for desired order: UNSTIM (top), HC (middle), CF_Asp_Neg (bottom)
abundance_by_treatment <- abundance_by_treatment %>%
mutate(
treatment = recode(treatment, "CF_Asp_Neg" = "CF"),
treatment = factor(treatment, levels = c("UNSTIM", "HC", "CF"))
)
# Create density plots
p7 <- ggplot(abundance_by_treatment, aes(x = log10_fraction, fill = treatment)) +
geom_density(alpha = 0.6, size = 1) +
facet_wrap(~ treatment, ncol = 1) +
labs(
title = "Distribution Density of miRNA Abundances by Treatment",
subtitle = "HC shows broader distributional tail; CF_Asp_Neg more similar to UNSTIM",
x = "Log10(Fractional Abundance)",
y = "Density",
fill = "Treatment"
) +
theme_classic(base_size = 12) +
theme(
legend.position = "none",
plot.title = element_text(size = 14, face = "bold"),
plot.subtitle = element_text(size = 11),
strip.text = element_text(size = 12, face = "bold"),
strip.background = element_rect(fill = "grey95", color = "grey50")
) +
scale_fill_brewer(palette = "Set2")
print(p7)
save_figure(p7, "abundance_density_plots", width = 8, height = 10)
## Saved: abundance_density_plots .png/.pdf
cat("\n\nKey observations from density plots:\n")
##
##
## Key observations from density plots:
cat("- HC (Healthy Control): Broader, more extended tail → greater diversity/evenness\n")
## - HC (Healthy Control): Broader, more extended tail → greater diversity/evenness
cat("- UNSTIM: More concentrated distribution with shorter tail\n")
## - UNSTIM: More concentrated distribution with shorter tail
cat("- CF_Asp_Neg: Similar shape to UNSTIM, suggesting CF exposure produces\n")
## - CF_Asp_Neg: Similar shape to UNSTIM, suggesting CF exposure produces
cat(" a compositional structure more like unstimulated cells than HC-stimulated\n")
## a compositional structure more like unstimulated cells than HC-stimulated
This plot directly visualizes what the Kolmogorov-Smirnov test compares. Larger vertical gaps between curves indicate greater distributional differences.
# Create ECDF plot
p8 <- ggplot(abundance_by_treatment, aes(x = log10_fraction, color = treatment)) +
stat_ecdf(size = 1.2, alpha = 0.8) +
labs(
title = "Empirical Cumulative Distribution Functions",
subtitle = "Direct visualization of Kolmogorov-Smirnov test comparisons\nLarger gaps between curves = more significant differences",
x = "Log10(Fractional Abundance)",
y = "Cumulative Probability",
color = "Treatment"
) +
theme_classic(base_size = 12) +
theme(
legend.position = "bottom",
plot.title = element_text(size = 14, face = "bold"),
plot.subtitle = element_text(size = 11)
) +
scale_color_brewer(palette = "Set2") +
scale_y_continuous(labels = percent_format())
print(p8)
save_figure(p8, "ECDF_by_treatment", width = 10, height = 7)
## Saved: ECDF_by_treatment .png/.pdf
Shows how quickly each treatment reaches abundance thresholds. Steeper curves indicate more concentrated distributions where fewer miRNAs dominate.
# Create cumulative abundance curves
p9 <- ggplot(abundance_by_treatment, aes(x = rank, y = cumsum_pct, color = treatment)) +
geom_line(size = 1.2, alpha = 0.8) +
geom_hline(yintercept = c(50, 80, 90, 95), linetype = "dashed", alpha = 0.5) +
scale_x_log10() +
labs(
title = "Cumulative Abundance Curves by Treatment",
subtitle = "How many miRNAs needed to reach X% of total abundance",
x = "Number of Top miRNAs (log scale)",
y = "Cumulative % of Total Abundance",
color = "Treatment"
) +
theme_classic(base_size = 12) +
theme(
legend.position = "bottom",
plot.title = element_text(face = "bold", size = 14)
) +
scale_color_brewer(palette = "Set2") +
scale_y_continuous(labels = percent_format(scale = 1)) +
annotate("text", x = 1000, y = c(50, 80, 90, 95),
label = c("50%", "80%", "90%", "95%"),
hjust = 1, alpha = 0.7, size = 3.5)
print(p9)
save_figure(p9, "cumulative_abundance_by_treatment", width = 10, height = 7)
## Saved: cumulative_abundance_by_treatment .png/.pdf
Ecological diversity metrics provide a complementary perspective on compositional differences. While Section 7 examined distributional shapes, this section quantifies how evenly miRNA reads are distributed across species using established diversity indices.
# Function to calculate Gini coefficient (0 = perfect equality, 1 = complete inequality)
gini_coef <- function(x) {
x <- sort(x[x > 0])
n <- length(x)
if(n == 0) return(NA)
2 * sum((1:n) * x) / (n * sum(x)) - (n + 1) / n
}
# Calculate diversity metrics per sample
# Shannon entropy: H = -Σ(p * ln(p))
shannon_per_sample <- apply(frac_abundance, 2, function(x) {
x <- x[x > 0] # Remove zeros
-sum(x * log(x))
})
# Simpson's diversity (1 - D): probability that two randomly selected reads are different miRNAs
simpson_per_sample <- apply(frac_abundance, 2, function(x) {
1 - sum(x^2)
})
# Effective number of species (Hill number q=1): exponential of Shannon entropy
# Represents the "equivalent" number of equally-abundant miRNAs
effective_species <- exp(shannon_per_sample)
# Richness: number of detected miRNAs per sample
richness_per_sample <- apply(frac_abundance, 2, function(x) sum(x > 0))
# Pielou's evenness: J = H / H_max = H / ln(S)
# Ranges 0-1; higher = more even distribution
pielou_evenness <- shannon_per_sample / log(richness_per_sample)
# Gini coefficient per sample
gini_per_sample <- apply(frac_abundance, 2, gini_coef)
# Create comprehensive diversity data frame
diversity_df <- data.frame(
Sample = names(shannon_per_sample),
Shannon = shannon_per_sample,
Simpson = simpson_per_sample,
Effective_Species = effective_species,
Richness = richness_per_sample,
Pielou_Evenness = pielou_evenness,
Gini = gini_per_sample,
Treatment = sample_mapping[names(shannon_per_sample), "comparison_7"],
row.names = NULL
)
# Set treatment factor levels
diversity_df$Treatment <- factor(diversity_df$Treatment,
levels = c("UNSTIM", "HC", "CF_Asp_Neg"))
# Display individual sample values
cat("\n=== Diversity Metrics by Sample ===\n\n")
##
## === Diversity Metrics by Sample ===
print(diversity_df %>% arrange(Treatment, Sample), digits = 3)
## Sample Shannon Simpson Effective_Species Richness Pielou_Evenness Gini Treatment
## 1 sample_10 2.58 0.738 13.2 1645 0.349 0.978 UNSTIM
## 2 sample_11 3.14 0.783 23.0 2083 0.410 0.927 UNSTIM
## 3 sample_17 2.40 0.686 11.0 1568 0.326 0.979 UNSTIM
## 4 sample_19 2.52 0.712 12.4 1657 0.340 0.979 UNSTIM
## 5 sample_20 2.75 0.747 15.7 1944 0.363 0.977 UNSTIM
## 6 sample_22 2.79 0.723 16.3 2082 0.365 0.960 UNSTIM
## 7 sample_37 2.85 0.764 17.3 2082 0.373 0.968 UNSTIM
## 8 sample_49 4.04 0.907 56.6 2071 0.528 0.951 UNSTIM
## 9 sample_5 3.49 0.837 32.9 1742 0.468 0.958 UNSTIM
## 10 sample_54 2.67 0.727 14.4 2080 0.349 0.972 UNSTIM
## 11 sample_55 4.88 0.920 131.0 2082 0.638 0.768 UNSTIM
## 12 sample_57 2.99 0.792 20.0 2076 0.392 0.972 UNSTIM
## 13 sample_58 2.92 0.766 18.6 2082 0.383 0.965 UNSTIM
## 14 sample_62 2.66 0.744 14.3 1644 0.359 0.976 UNSTIM
## 15 sample_68 2.75 0.747 15.6 2062 0.360 0.975 UNSTIM
## 16 sample_69 2.92 0.797 18.5 1601 0.395 0.972 UNSTIM
## 17 sample_14 3.89 0.920 49.1 1876 0.517 0.963 HC
## 18 sample_18 2.52 0.723 12.4 1692 0.339 0.980 HC
## 19 sample_2 3.72 0.921 41.2 1800 0.496 0.966 HC
## 20 sample_24 3.28 0.829 26.6 1936 0.434 0.971 HC
## 21 sample_26 2.61 0.741 13.6 1758 0.349 0.981 HC
## 22 sample_29 2.74 0.759 15.4 1855 0.363 0.979 HC
## 23 sample_3 4.02 0.934 55.7 1880 0.533 0.959 HC
## 24 sample_33 3.14 0.836 23.1 1701 0.422 0.975 HC
## 25 sample_52 4.25 0.963 70.1 1688 0.572 0.955 HC
## 26 sample_6 3.86 0.928 47.4 1958 0.509 0.967 HC
## 27 sample_63 2.91 0.773 18.3 2082 0.381 0.966 HC
## 28 sample_71 3.69 0.871 40.2 1812 0.492 0.958 HC
## 29 sample_75 3.10 0.811 22.1 1990 0.408 0.975 HC
## 30 sample_9 4.13 0.951 62.3 1844 0.549 0.959 HC
## 31 sample_1 3.05 0.821 21.2 1540 0.416 0.975 CF_Asp_Neg
## 32 sample_16 3.46 0.909 31.7 1899 0.458 0.976 CF_Asp_Neg
## 33 sample_27 3.12 0.828 22.7 2080 0.408 0.972 CF_Asp_Neg
## 34 sample_30 4.96 0.976 142.2 2083 0.649 0.875 CF_Asp_Neg
## 35 sample_31 2.79 0.828 16.3 1830 0.372 0.982 CF_Asp_Neg
## 36 sample_39 2.96 0.841 19.4 1992 0.390 0.979 CF_Asp_Neg
## 37 sample_42 3.00 0.853 20.1 1825 0.399 0.979 CF_Asp_Neg
## 38 sample_45 2.91 0.832 18.4 1756 0.390 0.975 CF_Asp_Neg
## 39 sample_47 3.47 0.886 32.1 2076 0.454 0.966 CF_Asp_Neg
## 40 sample_50 4.05 0.941 57.5 2083 0.530 0.934 CF_Asp_Neg
## 41 sample_51 4.49 0.964 89.3 2083 0.588 0.941 CF_Asp_Neg
## 42 sample_64 3.03 0.847 20.6 2047 0.397 0.981 CF_Asp_Neg
## 43 sample_66 3.12 0.884 22.7 1503 0.427 0.978 CF_Asp_Neg
## 44 sample_70 2.73 0.783 15.3 1715 0.366 0.983 CF_Asp_Neg
## 45 sample_73 3.31 0.894 27.4 1614 0.448 0.974 CF_Asp_Neg
## 46 sample_76 2.67 0.797 14.4 1668 0.359 0.979 CF_Asp_Neg
# Save sample-level diversity data
write.csv(diversity_df, file.path(output_dir, "diversity_metrics_by_sample.csv"),
row.names = FALSE)
# Summary statistics by treatment
diversity_summary <- diversity_df %>%
group_by(Treatment) %>%
summarise(
n = n(),
Shannon_Mean = mean(Shannon),
Shannon_SD = sd(Shannon),
Simpson_Mean = mean(Simpson),
Simpson_SD = sd(Simpson),
Pielou_Mean = mean(Pielou_Evenness),
Pielou_SD = sd(Pielou_Evenness),
Effective_Species_Mean = mean(Effective_Species),
Effective_Species_SD = sd(Effective_Species),
Gini_Mean = mean(Gini),
Gini_SD = sd(Gini),
.groups = 'drop'
)
cat("\n=== Diversity Metrics Summary by Treatment ===\n\n")
##
## === Diversity Metrics Summary by Treatment ===
print(diversity_summary, digits = 3)
## # A tibble: 3 × 12
## Treatment n Shannon_Mean Shannon_SD Simpson_Mean Simpson_SD Pielou_Mean Pielou_SD Effective_Species_Mean
## <fct> <int> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 UNSTIM 16 3.02 0.635 0.774 0.0654 0.400 0.0813 26.9
## 2 HC 14 3.42 0.591 0.854 0.0836 0.455 0.0789 35.5
## 3 CF_Asp_Neg 16 3.32 0.649 0.868 0.0575 0.441 0.0820 35.7
## # ℹ 3 more variables: Effective_Species_SD <dbl>, Gini_Mean <dbl>, Gini_SD <dbl>
write.csv(diversity_summary, file.path(output_dir, "diversity_summary_by_treatment.csv"),
row.names = FALSE)
cat("\n\nInterpretation of diversity metrics:\n")
##
##
## Interpretation of diversity metrics:
cat("- Shannon (H'): Higher values = more diverse/even distribution\n")
## - Shannon (H'): Higher values = more diverse/even distribution
cat("- Simpson (1-D): Probability two random reads are different miRNAs (higher = more diverse)\n")
## - Simpson (1-D): Probability two random reads are different miRNAs (higher = more diverse)
cat("- Pielou's J: Evenness relative to maximum possible (0-1 scale)\n")
## - Pielou's J: Evenness relative to maximum possible (0-1 scale)
cat("- Effective Species: Equivalent number of equally-abundant miRNAs\n")
## - Effective Species: Equivalent number of equally-abundant miRNAs
cat("- Gini: Inequality coefficient (higher = more dominated by few miRNAs)\n")
## - Gini: Inequality coefficient (higher = more dominated by few miRNAs)
# ANOVA for Shannon diversity
cat("\n=== ANOVA: Shannon Diversity by Treatment ===\n")
##
## === ANOVA: Shannon Diversity by Treatment ===
shannon_aov <- aov(Shannon ~ Treatment, data = diversity_df)
print(summary(shannon_aov))
## Df Sum Sq Mean Sq F value Pr(>F)
## Treatment 2 1.311 0.6553 1.668 0.201
## Residuals 43 16.895 0.3929
shannon_pval <- summary(shannon_aov)[[1]][["Pr(>F)"]][1]
if(shannon_pval < 0.05) {
cat("\n=== Tukey HSD Post-hoc: Shannon Diversity ===\n")
shannon_tukey <- TukeyHSD(shannon_aov)
print(shannon_tukey)
}
# ANOVA for Simpson diversity
cat("\n=== ANOVA: Simpson Diversity by Treatment ===\n")
##
## === ANOVA: Simpson Diversity by Treatment ===
simpson_aov <- aov(Simpson ~ Treatment, data = diversity_df)
print(summary(simpson_aov))
## Df Sum Sq Mean Sq F value Pr(>F)
## Treatment 2 0.08046 0.04023 8.464 0.000795 ***
## Residuals 43 0.20439 0.00475
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
simpson_pval <- summary(simpson_aov)[[1]][["Pr(>F)"]][1]
if(simpson_pval < 0.05) {
cat("\n=== Tukey HSD Post-hoc: Simpson Diversity ===\n")
simpson_tukey <- TukeyHSD(simpson_aov)
print(simpson_tukey)
}
##
## === Tukey HSD Post-hoc: Simpson Diversity ===
## Tukey multiple comparisons of means
## 95% family-wise confidence level
##
## Fit: aov(formula = Simpson ~ Treatment, data = diversity_df)
##
## $Treatment
## diff lwr upr p adj
## HC-UNSTIM 0.07993892 0.01869232 0.14118552 0.0077873
## CF_Asp_Neg-UNSTIM 0.09332780 0.03415796 0.15249764 0.0011799
## CF_Asp_Neg-HC 0.01338888 -0.04785773 0.07463548 0.8567601
# ANOVA for Pielou's evenness
cat("\n=== ANOVA: Pielou's Evenness by Treatment ===\n")
##
## === ANOVA: Pielou's Evenness by Treatment ===
pielou_aov <- aov(Pielou_Evenness ~ Treatment, data = diversity_df)
print(summary(pielou_aov))
## Df Sum Sq Mean Sq F value Pr(>F)
## Treatment 2 0.02471 0.012357 1.891 0.163
## Residuals 43 0.28097 0.006534
pielou_pval <- summary(pielou_aov)[[1]][["Pr(>F)"]][1]
if(pielou_pval < 0.05) {
cat("\n=== Tukey HSD Post-hoc: Pielou's Evenness ===\n")
pielou_tukey <- TukeyHSD(pielou_aov)
print(pielou_tukey)
}
# ANOVA for Gini coefficient
cat("\n=== ANOVA: Gini Coefficient by Treatment ===\n")
##
## === ANOVA: Gini Coefficient by Treatment ===
gini_aov <- aov(Gini ~ Treatment, data = diversity_df)
print(summary(gini_aov))
## Df Sum Sq Mean Sq F value Pr(>F)
## Treatment 2 0.00154 0.0007703 0.629 0.538
## Residuals 43 0.05263 0.0012239
gini_pval <- summary(gini_aov)[[1]][["Pr(>F)"]][1]
if(gini_pval < 0.05) {
cat("\n=== Tukey HSD Post-hoc: Gini Coefficient ===\n")
gini_tukey <- TukeyHSD(gini_aov)
print(gini_tukey)
}
# Summary of significance
cat("\n=== Summary of Diversity ANOVA Results ===\n")
##
## === Summary of Diversity ANOVA Results ===
diversity_anova_summary <- data.frame(
Index = c("Shannon", "Simpson", "Pielou_Evenness", "Gini"),
ANOVA_P_Value = c(shannon_pval, simpson_pval, pielou_pval, gini_pval),
Significant = c(
ifelse(shannon_pval < 0.05, "Yes", "No"),
ifelse(simpson_pval < 0.05, "Yes", "No"),
ifelse(pielou_pval < 0.05, "Yes", "No"),
ifelse(gini_pval < 0.05, "Yes", "No")
)
)
print(diversity_anova_summary, row.names = FALSE)
## Index ANOVA_P_Value Significant
## Shannon 0.200650358 No
## Simpson 0.000795274 Yes
## Pielou_Evenness 0.163253636 No
## Gini 0.537732140 No
write.csv(diversity_anova_summary, file.path(output_dir, "diversity_ANOVA_results.csv"),
row.names = FALSE)
# Create Shannon diversity boxplot
p_shannon <- ggplot(diversity_df, aes(x = Treatment, y = Shannon, fill = Treatment)) +
geom_boxplot(alpha = 0.7, outlier.shape = NA, width = 0.6) +
geom_jitter(width = 0.15, alpha = 0.7, size = 3) +
stat_summary(fun = mean, geom = "point", shape = 23, size = 4,
fill = "red", color = "darkred") +
scale_fill_brewer(palette = "Set2", labels = c("CF_Asp_Neg" = "CF")) +
scale_x_discrete(labels = c("CF_Asp_Neg" = "CF")) +
scale_fill_brewer(palette = "Set2") +
labs(
title = "Shannon Diversity of miRNA Cargo by Treatment",
subtitle = "Higher values indicate more even distribution across miRNA species\nRed diamonds = group means",
x = "Treatment",
y = "Shannon Diversity Index (H')"
) +
theme_classic(base_size = 12) +
theme(
plot.title = element_text(face = "bold", size = 14),
plot.subtitle = element_text(size = 10),
legend.position = "none"
)
# Add significance annotation if ANOVA is significant
if(shannon_pval < 0.05) {
# Get Tukey results for annotation
tukey_df <- as.data.frame(shannon_tukey$Treatment)
tukey_df$comparison <- rownames(tukey_df)
sig_comparisons <- tukey_df[tukey_df$`p adj` < 0.05, ]
if(nrow(sig_comparisons) > 0) {
p_shannon <- p_shannon +
annotate("text", x = 2, y = max(diversity_df$Shannon) + 0.1,
label = sprintf("ANOVA p = %.4f", shannon_pval),
size = 3.5, fontface = "italic")
}
}
print(p_shannon)
save_figure(p_shannon, "shannon_diversity_by_treatment", width = 8, height = 6)
## Saved: shannon_diversity_by_treatment .png/.pdf
# Reshape for faceted plot
diversity_long <- diversity_df %>%
dplyr::select(Sample, Treatment, Shannon, Simpson, Pielou_Evenness, Gini) %>%
pivot_longer(cols = c(Shannon, Simpson, Pielou_Evenness, Gini),
names_to = "Index", values_to = "Value") %>%
mutate(Index = factor(Index,
levels = c("Shannon", "Simpson", "Pielou_Evenness", "Gini"),
labels = c("Shannon (H')", "Simpson (1-D)",
"Pielou Evenness (J)", "Gini Coefficient")))
# Create multi-panel plot
p_diversity_panel <- ggplot(diversity_long, aes(x = Treatment, y = Value, fill = Treatment)) +
geom_boxplot(alpha = 0.7, outlier.shape = NA, width = 0.6) +
geom_jitter(width = 0.15, alpha = 0.6, size = 2) +
stat_summary(fun = mean, geom = "point", shape = 23, size = 3,
fill = "red", color = "darkred") +
facet_wrap(~ Index, scales = "free_y", ncol = 4) +
scale_fill_brewer(palette = "Set2") +
labs(
title = "Diversity and Evenness Metrics by Treatment",
subtitle = "Multiple indices capture different aspects of compositional diversity | Red diamonds = group means",
x = "Treatment",
y = "Index Value"
) +
theme_bw(base_size = 11) +
theme(
plot.title = element_text(face = "bold", size = 14),
plot.subtitle = element_text(size = 10),
strip.text = element_text(face = "bold", size = 11),
strip.background = element_rect(fill = "grey95"),
axis.text.x = element_text(angle = 45, hjust = 1),
legend.position = "none"
)
print(p_diversity_panel)
save_figure(p_diversity_panel, "diversity_panel_by_treatment", width = 14, height = 5)
## Saved: diversity_panel_by_treatment .png/.pdf
The effective number of species (Hill number q=1) provides an intuitive interpretation: how many equally-abundant miRNAs would produce the same Shannon diversity?
# Create effective species boxplot
p_effective <- ggplot(diversity_df, aes(x = Treatment, y = Effective_Species, fill = Treatment)) +
geom_boxplot(alpha = 0.7, outlier.shape = NA, width = 0.6) +
geom_jitter(width = 0.15, alpha = 0.7, size = 3) +
stat_summary(fun = mean, geom = "point", shape = 23, size = 4,
fill = "red", color = "darkred") +
scale_fill_brewer(palette = "Set2") +
labs(
title = "Effective Number of miRNA Species by Treatment",
subtitle = "Equivalent number of equally-abundant miRNAs (exp of Shannon entropy)",
x = "Treatment",
y = "Effective Number of Species"
) +
theme_classic(base_size = 12) +
theme(
plot.title = element_text(face = "bold", size = 14),
plot.subtitle = element_text(size = 10),
legend.position = "none"
)
print(p_effective)
save_figure(p_effective, "effective_species_by_treatment", width = 8, height = 6)
## Saved: effective_species_by_treatment .png/.pdf
# Report means
cat("\n=== Effective Number of Species by Treatment ===\n")
##
## === Effective Number of Species by Treatment ===
diversity_df %>%
group_by(Treatment) %>%
summarise(
Mean = mean(Effective_Species),
SD = sd(Effective_Species),
Median = median(Effective_Species),
.groups = 'drop'
) %>%
print(digits = 2)
## # A tibble: 3 × 4
## Treatment Mean SD Median
## <fct> <dbl> <dbl> <dbl>
## 1 UNSTIM 26.9 29.9 16.8
## 2 HC 35.5 19.3 33.4
## 3 CF_Asp_Neg 35.7 34.3 21.9
Rank-abundance curves show how steeply abundance drops off with rank. Steeper curves indicate communities more dominated by a few species; flatter curves indicate more even distributions.
# Calculate mean fractional abundance per miRNA for each treatment
rank_abundance_data <- frac_abundance %>%
as.data.frame() %>%
mutate(miRNA = rownames(.)) %>%
pivot_longer(-miRNA, names_to = "Sample", values_to = "Fraction") %>%
mutate(Treatment = sample_mapping[Sample, "comparison_7"]) %>%
group_by(Treatment, miRNA) %>%
summarise(Mean_Fraction = mean(Fraction), .groups = "drop") %>%
group_by(Treatment) %>%
arrange(desc(Mean_Fraction)) %>%
mutate(Rank = row_number()) %>%
ungroup() %>%
mutate(Treatment = factor(Treatment, levels = c("UNSTIM", "HC", "CF_Asp_Neg")))
# Create rank-abundance plot
p_rank_abundance <- ggplot(rank_abundance_data,
aes(x = Rank, y = Mean_Fraction, color = Treatment)) +
geom_line(size = 1, alpha = 0.8) +
scale_y_log10(labels = percent_format(accuracy = 0.01)) +
scale_x_log10() +
scale_color_brewer(palette = "Set2") +
labs(
title = "Rank-Abundance Curves by Treatment",
subtitle = "Steeper curves indicate more uneven distributions dominated by few miRNAs",
x = "Rank (log scale)",
y = "Mean Fractional Abundance (log scale)",
color = "Treatment"
) +
theme_classic(base_size = 12) +
theme(
plot.title = element_text(face = "bold", size = 14),
plot.subtitle = element_text(size = 10),
legend.position = "bottom"
)
print(p_rank_abundance)
save_figure(p_rank_abundance, "rank_abundance_curves", width = 10, height = 7)
## Saved: rank_abundance_curves .png/.pdf
# Zoom to top 100 miRNAs
p_rank_abundance_zoom <- ggplot(rank_abundance_data %>% filter(Rank <= 100),
aes(x = Rank, y = Mean_Fraction, color = Treatment)) +
geom_line(size = 1.2, alpha = 0.8) +
geom_point(size = 1.5, alpha = 0.6) +
scale_y_log10(labels = percent_format(accuracy = 0.01)) +
scale_color_brewer(palette = "Set2") +
labs(
title = "Rank-Abundance Curves: Top 100 miRNAs",
subtitle = "Focused view of most abundant miRNAs across treatment groups",
x = "Rank",
y = "Mean Fractional Abundance (log scale)",
color = "Treatment"
) +
theme_classic(base_size = 12) +
theme(
plot.title = element_text(face = "bold", size = 14),
plot.subtitle = element_text(size = 10),
legend.position = "bottom"
)
print(p_rank_abundance_zoom)
save_figure(p_rank_abundance_zoom, "rank_abundance_curves_top100", width = 10, height = 7)
## Saved: rank_abundance_curves_top100 .png/.pdf
cat("\n")
cat("================================================================================\n")
## ================================================================================
cat(" DIVERSITY ANALYSIS INTERPRETATION \n")
## DIVERSITY ANALYSIS INTERPRETATION
cat("================================================================================\n\n")
## ================================================================================
# Calculate and report key findings
diversity_means <- diversity_df %>%
group_by(Treatment) %>%
summarise(
Shannon = mean(Shannon),
Pielou = mean(Pielou_Evenness),
Gini = mean(Gini),
Effective = mean(Effective_Species),
.groups = 'drop'
)
cat("KEY FINDINGS:\n\n")
## KEY FINDINGS:
cat("1. Shannon Diversity (H'):\n")
## 1. Shannon Diversity (H'):
for(i in 1:nrow(diversity_means)) {
cat(sprintf(" %s: %.3f\n", diversity_means$Treatment[i], diversity_means$Shannon[i]))
}
## UNSTIM: 3.021
## HC: 3.418
## CF_Asp_Neg: 3.320
cat(sprintf(" Order: %s\n\n",
paste(diversity_means$Treatment[order(diversity_means$Shannon, decreasing = TRUE)],
collapse = " > ")))
## Order: HC > CF_Asp_Neg > UNSTIM
cat("2. Pielou's Evenness (J):\n")
## 2. Pielou's Evenness (J):
for(i in 1:nrow(diversity_means)) {
cat(sprintf(" %s: %.3f\n", diversity_means$Treatment[i], diversity_means$Pielou[i]))
}
## UNSTIM: 0.400
## HC: 0.455
## CF_Asp_Neg: 0.441
cat(sprintf(" Order: %s\n\n",
paste(diversity_means$Treatment[order(diversity_means$Pielou, decreasing = TRUE)],
collapse = " > ")))
## Order: HC > CF_Asp_Neg > UNSTIM
cat("3. Gini Coefficient (inequality):\n")
## 3. Gini Coefficient (inequality):
for(i in 1:nrow(diversity_means)) {
cat(sprintf(" %s: %.3f\n", diversity_means$Treatment[i], diversity_means$Gini[i]))
}
## UNSTIM: 0.955
## HC: 0.968
## CF_Asp_Neg: 0.965
cat(sprintf(" Order (most to least concentrated): %s\n\n",
paste(diversity_means$Treatment[order(diversity_means$Gini, decreasing = TRUE)],
collapse = " > ")))
## Order (most to least concentrated): HC > CF_Asp_Neg > UNSTIM
cat("4. Effective Number of Species:\n")
## 4. Effective Number of Species:
for(i in 1:nrow(diversity_means)) {
cat(sprintf(" %s: %.1f miRNAs\n", diversity_means$Treatment[i], diversity_means$Effective[i]))
}
## UNSTIM: 26.9 miRNAs
## HC: 35.5 miRNAs
## CF_Asp_Neg: 35.7 miRNAs
cat("\n--------------------------------------------------------------------------------\n")
##
## --------------------------------------------------------------------------------
cat("BIOLOGICAL INTERPRETATION:\n\n")
## BIOLOGICAL INTERPRETATION:
# Determine the pattern
if(diversity_means$Shannon[diversity_means$Treatment == "HC"] >
diversity_means$Shannon[diversity_means$Treatment == "CF_Asp_Neg"] &&
diversity_means$Shannon[diversity_means$Treatment == "CF_Asp_Neg"] >
diversity_means$Shannon[diversity_means$Treatment == "UNSTIM"]) {
cat("The pattern HC > CF > UNSTIM for miRNA diversity suggests:\n\n")
cat("• Exposure to bronchoalveolar lavage stimulates HMSCs to diversify their\n")
cat(" EV-associated miRNA cargo beyond the baseline (UNSTIM) state.\n\n")
cat("• Healthy BAL induces the GREATEST diversification, suggesting that healthy\n")
cat(" lung signals trigger a broad, complex cellular response in HMSCs.\n\n")
cat("• CF BAL produces an ATTENUATED diversification response—miRNA cargo\n")
cat(" becomes more diverse than unstimulated cells but less diverse than\n")
cat(" healthy BAL-exposed cells.\n\n")
cat("• This 'diversity deficit' in CF-exposed EVs may reflect:\n")
cat(" (a) Impaired HMSC responsiveness to disease-associated signals\n")
cat(" (b) A fundamentally different signaling landscape in CF lungs that fails\n")
cat(" to trigger full cargo diversification\n")
cat(" (c) Presence of inhibitory factors in CF BAL that constrain EV loading\n\n")
cat("• THERAPEUTIC IMPLICATION: If therapeutic efficacy requires diverse miRNA\n")
cat(" cargo for pleiotropic effects, CF-conditioned EVs may be less effective\n")
cat(" not because they lack specific miRNAs, but because they lack\n")
cat(" compositional complexity.\n")
} else {
cat("The observed diversity pattern differs from the expected HC > CF > UNSTIM.\n")
cat("Please examine the summary statistics above to interpret the biological\n")
cat("significance of the observed pattern.\n")
}
## The pattern HC > CF > UNSTIM for miRNA diversity suggests:
##
## • Exposure to bronchoalveolar lavage stimulates HMSCs to diversify their
## EV-associated miRNA cargo beyond the baseline (UNSTIM) state.
##
## • Healthy BAL induces the GREATEST diversification, suggesting that healthy
## lung signals trigger a broad, complex cellular response in HMSCs.
##
## • CF BAL produces an ATTENUATED diversification response—miRNA cargo
## becomes more diverse than unstimulated cells but less diverse than
## healthy BAL-exposed cells.
##
## • This 'diversity deficit' in CF-exposed EVs may reflect:
## (a) Impaired HMSC responsiveness to disease-associated signals
## (b) A fundamentally different signaling landscape in CF lungs that fails
## to trigger full cargo diversification
## (c) Presence of inhibitory factors in CF BAL that constrain EV loading
##
## • THERAPEUTIC IMPLICATION: If therapeutic efficacy requires diverse miRNA
## cargo for pleiotropic effects, CF-conditioned EVs may be less effective
## not because they lack specific miRNAs, but because they lack
## compositional complexity.
cat("\n================================================================================\n")
##
## ================================================================================
# Export fractional abundance for major miRNAs
major_frac_export <- frac_abundance[major_mirna_names, ] %>%
as.data.frame() %>%
mutate(miRNA = rownames(.)) %>%
dplyr::select(miRNA, everything())
write.csv(major_frac_export, file.path(output_dir, "major_miRNAs_fractional_abundances.csv"),
row.names = FALSE)
# Export all fractional abundances (sorted by mean)
all_frac_export <- frac_abundance %>%
as.data.frame() %>%
mutate(
miRNA = rownames(.),
Mean_Fraction = rowMeans(.)
) %>%
arrange(desc(Mean_Fraction)) %>%
dplyr::select(miRNA, Mean_Fraction, everything())
write.csv(all_frac_export, file.path(output_dir, "all_miRNA_fractional_abundances.csv"),
row.names = FALSE)
cat("\n=== Data Files Exported ===\n\n")
##
## === Data Files Exported ===
cat("Compositional Analysis:\n")
## Compositional Analysis:
cat(" 1. major_miRNAs_1pct_summary.csv\n")
## 1. major_miRNAs_1pct_summary.csv
cat(" 2. major_miRNAs_fractional_abundances.csv\n")
## 2. major_miRNAs_fractional_abundances.csv
cat(" 3. all_miRNA_fractional_abundances.csv\n")
## 3. all_miRNA_fractional_abundances.csv
cat(" 4. major_miRNAs_ANOVA_results.csv\n")
## 4. major_miRNAs_ANOVA_results.csv
cat(" 5. major_miRNAs_posthoc_comparisons.csv (if significant differences found)\n")
## 5. major_miRNAs_posthoc_comparisons.csv (if significant differences found)
cat(" 6. major_miRNAs_treatment_summary.csv\n")
## 6. major_miRNAs_treatment_summary.csv
cat(" 7. KS_test_results.csv\n\n")
## 7. KS_test_results.csv
cat("Diversity Analysis:\n")
## Diversity Analysis:
cat(" 8. diversity_metrics_by_sample.csv\n")
## 8. diversity_metrics_by_sample.csv
cat(" 9. diversity_summary_by_treatment.csv\n")
## 9. diversity_summary_by_treatment.csv
cat(" 10. diversity_ANOVA_results.csv\n\n")
## 10. diversity_ANOVA_results.csv
cat("=== Figures Created ===\n\n")
## === Figures Created ===
cat("Compositional Visualizations:\n")
## Compositional Visualizations:
cat(" 1. major_miRNAs_barplot.png/.pdf\n")
## 1. major_miRNAs_barplot.png/.pdf
cat(" 2. cumulative_abundance_plot.png/.pdf\n")
## 2. cumulative_abundance_plot.png/.pdf
cat(" 3. composition_stacked_bar.png/.pdf\n")
## 3. composition_stacked_bar.png/.pdf
cat(" 4. major_miRNAs_heatmap_by_sample.png/.pdf\n")
## 4. major_miRNAs_heatmap_by_sample.png/.pdf
cat(" 5. major_miRNAs_boxplot_by_group.png/.pdf\n")
## 5. major_miRNAs_boxplot_by_group.png/.pdf
cat(" 6. major_miRNAs_significant_differences.png/.pdf (if significant)\n")
## 6. major_miRNAs_significant_differences.png/.pdf (if significant)
cat(" 7. major_miRNAs_stacked_by_sample.png/.pdf\n")
## 7. major_miRNAs_stacked_by_sample.png/.pdf
cat(" 8. abundance_density_plots.png/.pdf\n")
## 8. abundance_density_plots.png/.pdf
cat(" 9. ECDF_by_treatment.png/.pdf\n")
## 9. ECDF_by_treatment.png/.pdf
cat(" 10. cumulative_abundance_by_treatment.png/.pdf\n\n")
## 10. cumulative_abundance_by_treatment.png/.pdf
cat("Diversity Visualizations:\n")
## Diversity Visualizations:
cat(" 11. shannon_diversity_by_treatment.png/.pdf\n")
## 11. shannon_diversity_by_treatment.png/.pdf
cat(" 12. diversity_panel_by_treatment.png/.pdf\n")
## 12. diversity_panel_by_treatment.png/.pdf
cat(" 13. effective_species_by_treatment.png/.pdf\n")
## 13. effective_species_by_treatment.png/.pdf
cat(" 14. rank_abundance_curves.png/.pdf\n")
## 14. rank_abundance_curves.png/.pdf
cat(" 15. rank_abundance_curves_top100.png/.pdf\n")
## 15. rank_abundance_curves_top100.png/.pdf
cat("\n=== All outputs saved to: ===\n")
##
## === All outputs saved to: ===
cat(output_dir, "\n")
## /Users/d20605c/Documents/MSC-EV-miRNA-CF-BAL/results/01_compositional
sessionInfo()
## R version 4.5.2 (2025-10-31)
## Platform: aarch64-apple-darwin20
## Running under: macOS Tahoe 26.2
##
## Matrix products: default
## BLAS: /System/Library/Frameworks/Accelerate.framework/Versions/A/Frameworks/vecLib.framework/Versions/A/libBLAS.dylib
## LAPACK: /Library/Frameworks/R.framework/Versions/4.5-arm64/Resources/lib/libRlapack.dylib; LAPACK version 3.12.1
##
## locale:
## [1] en_US.UTF-8/en_US.UTF-8/en_US.UTF-8/C/en_US.UTF-8/en_US.UTF-8
##
## time zone: America/New_York
## tzcode source: internal
##
## attached base packages:
## [1] stats4 stats graphics grDevices datasets utils methods base
##
## other attached packages:
## [1] org.Hs.eg.db_3.22.0 AnnotationDbi_1.72.0 qvalue_2.42.0 ReactomePA_1.54.0
## [5] enrichplot_1.30.4 clusterProfiler_4.18.4 multiMiR_1.32.0 DESeq2_1.50.2
## [9] SummarizedExperiment_1.40.0 Biobase_2.70.0 MatrixGenerics_1.22.0 matrixStats_1.5.0
## [13] GenomicRanges_1.62.1 Seqinfo_1.0.0 IRanges_2.44.0 S4Vectors_0.48.0
## [17] BiocGenerics_0.56.0 generics_0.1.4 flextable_0.9.10 officer_0.7.2
## [21] ggrepel_0.9.6 scales_1.4.0 RColorBrewer_1.1-3 ggplot2_4.0.1
## [25] tidyr_1.3.2 readr_2.1.6 dplyr_1.1.4 readxl_1.4.5
## [29] here_1.0.2
##
## loaded via a namespace (and not attached):
## [1] splines_4.5.2 bitops_1.0-9 ggplotify_0.1.3 tibble_3.3.0 R.oo_1.27.1
## [6] cellranger_1.1.0 polyclip_1.10-7 graph_1.88.1 XML_3.99-0.20 lifecycle_1.0.4
## [11] rprojroot_2.1.1 lattice_0.22-7 vroom_1.6.7 MASS_7.3-65 magrittr_2.0.4
## [16] sass_0.4.10 rmarkdown_2.30 jquerylib_0.1.4 yaml_2.3.12 ggtangle_0.0.9
## [21] zip_2.3.3 askpass_1.2.1 cowplot_1.2.0 DBI_1.2.3 abind_1.4-8
## [26] purrr_1.2.0 R.utils_2.13.0 ggraph_2.2.2 RCurl_1.98-1.17 yulab.utils_0.2.3
## [31] tweenr_2.0.3 rappdirs_0.3.3 gdtools_0.4.4 tidytree_0.4.6 reactome.db_1.94.0
## [36] codetools_0.2-20 DelayedArray_0.36.0 DOSE_4.4.0 xml2_1.5.1 ggforce_0.5.0
## [41] tidyselect_1.2.1 aplot_0.2.9 farver_2.1.2 viridis_0.6.5 jsonlite_2.0.0
## [46] tidygraph_1.3.1 systemfonts_1.3.1 tools_4.5.2 ggnewscale_0.5.2 treeio_1.34.0
## [51] ragg_1.5.0 Rcpp_1.1.0 glue_1.8.0 gridExtra_2.3 SparseArray_1.10.8
## [56] xfun_0.55 withr_3.0.2 BiocManager_1.30.27 fastmap_1.2.0 openssl_2.3.4
## [61] digest_0.6.39 R6_2.6.1 gridGraphics_0.5-1 textshaping_1.0.4 GO.db_3.22.0
## [66] RSQLite_2.4.5 R.methodsS3_1.8.2 utf8_1.2.6 fontLiberation_0.1.0 renv_1.1.5
## [71] data.table_1.18.0 graphlayouts_1.2.2 httr_1.4.7 htmlwidgets_1.6.4 S4Arrays_1.10.1
## [76] scatterpie_0.2.6 graphite_1.56.0 pkgconfig_2.0.3 gtable_0.3.6 blob_1.2.4
## [81] S7_0.2.1 XVector_0.50.0 htmltools_0.5.9 fontBitstreamVera_0.1.1 fgsea_1.36.0
## [86] png_0.1-8 ggfun_0.2.0 knitr_1.51 tzdb_0.5.0 reshape2_1.4.5
## [91] uuid_1.2-1 nlme_3.1-168 cachem_1.1.0 stringr_1.6.0 parallel_4.5.2
## [96] pillar_1.11.1 grid_4.5.2 vctrs_0.6.5 tidydr_0.0.6 cluster_2.1.8.1
## [101] evaluate_1.0.5 cli_3.6.5 locfit_1.5-9.12 compiler_4.5.2 rlang_1.1.6
## [106] crayon_1.5.3 labeling_0.4.3 plyr_1.8.9 fs_1.6.6 ggiraph_0.9.2
## [111] stringi_1.8.7 viridisLite_0.4.2 BiocParallel_1.44.0 Biostrings_2.78.0 lazyeval_0.2.2
## [116] GOSemSim_2.36.0 fontquiver_0.2.1 Matrix_1.7-4 hms_1.1.4 patchwork_1.3.2
## [121] bit64_4.6.0-1 KEGGREST_1.50.0 igraph_2.2.1 memoise_2.0.1 bslib_0.9.0
## [126] ggtree_4.0.3 fastmatch_1.1-6 bit_4.6.0 ape_5.8-1 gson_0.1.0
This compositional and diversity analysis demonstrates that:
13 miRNAs individually account for ≥1% of total reads, collectively representing 70.4% of all miRNA reads in extracellular vesicles from hMSCs.
The most abundant miRNA is miR-6126, representing 33.17% of total reads.
The remaining 2070 detected miRNAs collectively represent only 29.6% of total reads.
This highly skewed distribution is typical of miRNA populations in extracellular vesicles, where a small number of miRNAs dominate the cargo.
The stacked bar plot (Section 6.6) shows compositional consistency within treatment groups and reveals compositional differences between groups at the individual sample level.
Statistical testing reveals which major miRNAs differ between treatment groups (Section 6.3):
Treatment groups show significantly different compositional structures (Section 7):
Diversity analysis provides quantitative metrics for compositional complexity (Section 8):
The analysis identifies not only which specific miRNAs are most abundant, but also which show treatment-dependent changes, providing insight into both the stable “core” miRNA cargo and the treatment-responsive miRNA components in hMSC-derived EVs.
Diversity metrics suggest BAL exposure promotes miRNA cargo diversification, with healthy BAL inducing greater diversity than CF BAL. This “diversity deficit” in CF-exposed EVs may have therapeutic implications if compositional complexity is required for pleiotropic effects.
The visualizations and data tables provide multiple perspectives on compositional structure, focusing on biologically relevant miRNAs that individually contribute ≥1% to the total miRNA population in hMSC-derived extracellular vesicles under different exposure conditions.